# Continuous-colour scatter: achieved data-sharing percentage against the
# share of an agent's grants that mandate sharing, coloured by the
# pubs-vs-data incentive weight.
ggplot(df_clean,
       aes(x = data_grant_share, y = data_sharing_perc,
           colour = pubs.vs.data)) +
  geom_point(size = 1.5) +
  scale_colour_viridis_c()
# Why does the right side not go up higher? Is this the base funding that
# stays stronger for those with not many grants?
# Same scatter with discrete colours per incentive setting; the `stats`
# label column feeds plotly hover text (publications and grant counts).
p <- df_clean %>%
  mutate(stats = paste("pubs:", n_publications, "grants:", total_grants)) %>%
  ggplot(aes(x = data_grant_share, y = data_sharing_perc,
             colour = factor(pubs.vs.data),
             label = stats)) +
  geom_point(alpha = .5) +
  scale_colour_viridis_d()
p
# Interactive version for inspecting individual agents on hover.
plotly::ggplotly(p)
# Rewarding agents based on shared datasets simply adds more randomness.
# Total grants won versus the share of data-mandating grants, faceted by
# incentive setting, to see how the setting shapes funding outcomes.
ggplot(df_clean, aes(x = data_grant_share, y = total_grants)) +
  geom_point() +
  facet_wrap(vars(pubs.vs.data)) +
  labs(title = "Influence of incentive settings")
# Leftover interactive debugging call: browser() pauses execution at this
# point (and is a no-op-with-warning or an error source when the script is
# run non-interactively), so it is disabled here. Re-enable locally if you
# need to step through the environment at this point.
# browser()
## Called from: eval(expr, envir, enclos)
# Baseline additive model: predict publication count from the total number
# of grants and the share of those grants that mandate data sharing.
m1 <- lm(n_publications ~ total_grants + data_grant_share,
data = df_clean)
summary(m1)
##
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share,
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -263.678 -11.631 0.196 11.898 215.479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 151.876133 0.742424 204.568 <2e-16 ***
## total_grants 5.863183 0.001378 4255.650 <2e-16 ***
## data_grant_share -12.989140 1.398584 -9.287 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.18 on 11997 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9993
## F-statistic: 9.055e+06 on 2 and 11997 DF, p-value: < 2.2e-16
# Refit with standardized inputs so coefficient magnitudes are comparable
# across predictors. NOTE(review): this overwrites m1 with the standardized
# refit; later comparisons (e.g. the AIC call further down) use this version.
m1 <- arm::standardize(m1)
summary(m1)
##
## Call:
## lm(formula = n_publications ~ z.total_grants + z.data_grant_share,
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -263.678 -11.631 0.196 11.898 215.479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 614.4292 0.2207 2784.154 <2e-16 ***
## z.total_grants 1878.4223 0.4414 4255.650 <2e-16 ***
## z.data_grant_share -4.0994 0.4414 -9.287 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.18 on 11997 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9993
## F-statistic: 9.055e+06 on 2 and 11997 DF, p-value: < 2.2e-16
# Add a quadratic term for total_grants to test for curvature in the
# grants -> publications relationship.
m2 <- lm(n_publications ~ total_grants + data_grant_share +
I(total_grants^2),
data = df_clean)
summary(m2)
##
## Call:
## lm(formula = n_publications ~ total_grants + data_grant_share +
## I(total_grants^2), data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -280.413 -11.818 -0.226 11.676 212.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.532e+02 7.473e-01 205.062 <2e-16 ***
## total_grants 5.819e+00 3.962e-03 1468.861 <2e-16 ***
## data_grant_share -1.299e+01 1.391e+00 -9.342 <2e-16 ***
## I(total_grants^2) 6.652e-05 5.655e-06 11.764 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.04 on 11996 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9993
## F-statistic: 6.106e+06 on 3 and 11996 DF, p-value: < 2.2e-16
# Standardized refit of m2 (overwrites m2), as done for m1.
m2 <- arm::standardize(m2)
summary(m2)
##
## Call:
## lm(formula = n_publications ~ z.total_grants + z.data_grant_share +
## I(z.total_grants^2), data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -280.413 -11.818 -0.226 11.676 212.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 612.7223 0.2631 2329.144 <2e-16 ***
## z.total_grants 1867.8216 1.0023 1863.498 <2e-16 ***
## z.data_grant_share -4.1002 0.4389 -9.342 <2e-16 ***
## I(z.total_grants^2) 6.8281 0.5804 11.764 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.04 on 11996 degrees of freedom
## Multiple R-squared: 0.9993, Adjusted R-squared: 0.9993
## F-statistic: 6.106e+06 on 3 and 11996 DF, p-value: < 2.2e-16
# m2 fits better, but has too high VIF
# (the linear and squared grant terms are strongly collinear).
car::vif(m2)
## z.total_grants z.data_grant_share I(z.total_grants^2)
## 5.215605 1.000003 5.215601
# so we stick with model 1
# however, there might be an interaction between the share of grants and the
# total number of grants
# Interaction model: the effect of the data-mandating grant share may depend
# on how many grants an agent holds in total. Standardized immediately
# (overwrites m3), matching the treatment of m1.
m3 <- lm(n_publications ~ total_grants*data_grant_share,
data = df_clean)
m3 <- arm::standardize(m3)
summary(m3)
##
## Call:
## lm(formula = n_publications ~ z.total_grants * z.data_grant_share,
## data = df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -256.593 -11.406 0.112 11.768 219.490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 614.3644 0.2175 2825.01 <2e-16 ***
## z.total_grants 1877.9822 0.4355 4311.96 <2e-16 ***
## z.data_grant_share -36.2906 1.7486 -20.75 <2e-16 ***
## z.total_grants:z.data_grant_share -150.3289 7.9089 -19.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.82 on 11996 degrees of freedom
## Multiple R-squared: 0.9994, Adjusted R-squared: 0.9994
## F-statistic: 6.218e+06 on 3 and 11996 DF, p-value: < 2.2e-16
# Compare the interaction model against the additive baseline
# (both standardized); lower AIC wins.
AIC(m3, m1)
## df AIC
## m3 5 110153.3
## m1 4 110507.3
# m3 is clearly better
# Prepare per-scenario modelling: one nested data tibble per incentive
# setting (pubs.vs.data).
df_nested <- nest(group_by(df_clean, pubs.vs.data))
# Fit the chosen interaction model (total grants x data-mandating share)
# on one scenario's data; returns the fitted lm object.
modeling_fun <- function(df) {
  lm(n_publications ~ total_grants*data_grant_share, data = df)
}
# Fit the interaction model separately for each incentive setting, then
# attach per-model fit statistics (glance) and coefficient tables with
# confidence intervals (tidy).
df_nested <- df_nested %>%
  mutate(model = map(data, modeling_fun)) %>%
  mutate(glance = map(model, broom::glance),
         tidy = map(model, ~broom::tidy(.x, conf.int = TRUE)))
df_nested
## # A tibble: 6 x 5
## # Groups: pubs.vs.data [6]
## pubs.vs.data data model glance tidy
## <dbl> <list> <list> <list> <list>
## 1 0 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 2 0.4 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 3 0.2 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 4 1 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 5 0.8 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
## 6 0.6 <tibble [2,000 x 8]> <lm> <tibble [1 x 12]> <tibble [4 x 7]>
# Per-scenario fit statistics: R^2, sigma, AIC/BIC etc., one row per
# incentive setting.
df_nested %>%
unnest(glance)
## # A tibble: 6 x 16
## # Groups: pubs.vs.data [6]
## pubs.vs.data data model r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance
## <dbl> <list> <lis> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 <tibble [2,000 ~ <lm> 0.999 0.999 23.9 598222. 0 3 -9187. 18384. 18412. 1143973.
## 2 0.4 <tibble [2,000 ~ <lm> 0.999 0.999 23.0 1146585. 0 3 -9109. 18228. 18256. 1058188.
## 3 0.2 <tibble [2,000 ~ <lm> 0.999 0.999 23.0 925979. 0 3 -9105. 18221. 18249. 1054267.
## 4 1 <tibble [2,000 ~ <lm> 0.999 0.999 24.4 1187276. 0 3 -9229. 18468. 18496. 1193081.
## 5 0.8 <tibble [2,000 ~ <lm> 0.999 0.999 23.0 1273910. 0 3 -9104. 18219. 18247. 1053287.
## 6 0.6 <tibble [2,000 ~ <lm> 0.999 0.999 24.6 1167053. 0 3 -9242. 18494. 18522. 1208455.
## # ... with 3 more variables: df.residual <int>, nobs <int>, tidy <list>
# Visualise the fitted models: build a prediction grid (grant counts 1-50
# crossed with five data-mandating shares), score each scenario's model on
# it, and facet the predicted productivity curves by incentive setting.
dummy_data <- tibble(
  total_grants = rep(1:50, times = 5),
  data_grant_share = rep(c(0, .25, .5, .75, 1), each = 50)
)
df_nested %>%
  mutate(new_data = list(dummy_data),
         prediction = map2(model, new_data,
                           ~broom::augment(.x, newdata = .y))) %>%
  unnest(prediction) %>%
  ggplot(aes(x = total_grants, y = .fitted,
             colour = as.factor(data_grant_share))) +
  geom_line() +
  geom_point() +
  facet_wrap(vars(pubs.vs.data)) +
  labs(y = "Predicted number of publications",
       colour = "% of grants mandating data sharing",
       title = "Productivity depending on data sharing incentives") +
  theme(legend.position = "top")
# When data sharing is incentivised this way, the groups funded solely by
# funders that mandate open data fare worse than the others.
# This explains the lower inequality in the system: some actors are
# disadvantaged, so randomness accounts for a larger share of the outcomes.
# Importantly, there appears to be an early advantage, but it quickly fades
# and turns into a disadvantage.
# A promising next step would be to give agents strategies — e.g. "always
# share data if the last n grants mandated sharing" (a longer-term effect)
# versus simply complying with the current grant — and then determine what
# share of funders mandating data sharing is needed before behaviour changes.
# It could also be worth adding agents that always share data and examining
# how they are affected.